1 Aufgabenstellung

2 Daten aufbereiten und Pakete Lesen

2.1 Pakete laden und Daten einlesen

2.1.1 Pakete laden

# Pakete für Data Wrangling und Visualisierung
library(tidyverse)
library(rsample)
library(hablar)

# Pakete für das HTML
library(bookdown)
library(knitr)

# Recommenderlab und ähnlich
library(recommenderlab)
library(vegan)
library(coop)

2.1.2 Konfiguration

# knitr chunk options for the whole document.
# BUG FIX: the option was misspelled "fit.align"; knitr silently ignores
# unknown chunk options, so figures were never actually left-aligned.
knitr::opts_chunk$set(fig.align = "left", cache = TRUE, warning = FALSE, message = FALSE)
# Global seed so all sampling below is reproducible
set.seed(100)

2.1.3 Daten einlesen

# Read the four MovieLens CSV files (comma-separated)
movies1 <- read.csv("ml-latest-small/movies.csv", sep = ",")
links1 <- read.csv("ml-latest-small/links.csv", sep = ",")
ratings1 <- read.csv("ml-latest-small/ratings.csv", sep = ",")
tags1 <- read.csv("ml-latest-small/tags.csv", sep = ",")

# Sample 2: draw 70% of the movies, then keep only the rows of the other
# tables that refer to those movies; ratings are additionally thinned to 70%
set.seed(69)
movies2 <- movies1 %>% slice_sample(prop = 0.7)
links2 <- subset(links1, movieId %in% movies2$movieId)
ratings2 <- subset(ratings1, movieId %in% movies2$movieId) %>% slice_sample(prop = 0.7)
tags2 <- subset(tags1, movieId %in% movies2$movieId)

# Sample 1: same procedure with a different seed. NOTE: this overwrites the
# full tables (movies1, links1, ...), so sample 2 above must be drawn first.
set.seed(100)
movies1 <- movies1 %>% slice_sample(prop = 0.7)
links1 <- subset(links1, movieId %in% movies1$movieId)
ratings1 <- subset(ratings1, movieId %in% movies1$movieId) %>% slice_sample(prop = 0.7)
tags1 <- subset(tags1, movieId %in% movies1$movieId)

3 EDA

3.1 Welches sind die am häufigsten geschauten Filme?

3.1.1 Sample 1

# Top 3 most-rated movies in sample 1 (rating count as proxy for "watched")
movies1 %>%
  left_join(ratings1, "movieId") %>%
  group_by(title, movieId, genres) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(3)
## # A tibble: 3 × 4
## # Groups:   title, movieId [3]
##   title                                     movieId genres                 count
##   <chr>                                       <int> <chr>                  <int>
## 1 Forrest Gump (1994)                           356 Comedy|Drama|Romance|…   234
## 2 Pulp Fiction (1994)                           296 Comedy|Crime|Drama|Th…   200
## 3 Star Wars: Episode IV - A New Hope (1977)     260 Action|Adventure|Sci-…   181

3.1.2 Sample 2

# Top 3 most-rated movies in sample 2
movies2 %>%
  left_join(ratings2, "movieId") %>%
  group_by(title, movieId, genres) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(3)
## # A tibble: 3 × 4
## # Groups:   title, movieId [3]
##   title                                     movieId genres                 count
##   <chr>                                       <int> <chr>                  <int>
## 1 Shawshank Redemption, The (1994)              318 Crime|Drama              228
## 2 Pulp Fiction (1994)                           296 Comedy|Crime|Drama|Th…   219
## 3 Star Wars: Episode IV - A New Hope (1977)     260 Action|Adventure|Sci-…   182

3.1.3 Conclusion

Wir können nicht bestimmen, wie oft ein Film geschaut wurde, da es zu dieser Information keine Daten gibt. Als Alternative definieren wir, dass „geschaut“ und „bewertet“ gleichgesetzt werden. Die am häufigsten geschauten/bewerteten Filme sind “Forrest Gump”, “Pulp Fiction”, “Star Wars: Episode IV - A New Hope” und “Shawshank Redemption”.

3.2 Welches sind die am häufigsten geschauten Genres?

3.2.1 Sample 1

# One row per (movie, genre); empty genre strings get a placeholder label
genres_sep1 <- movies1 %>%
  separate_rows(genres, sep = "\\|", convert = FALSE) %>%
  replace(. == "", "no genres listed")

# Count ratings per genre and show the top three (sample 1)
genres_sep1 %>%
  right_join(ratings1, "movieId") %>%
  group_by(genres) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(3)
## # A tibble: 3 × 2
##   genres count
##   <chr>  <int>
## 1 Drama  20803
## 2 Comedy 19432
## 3 Action 14383

3.2.2 Sample 2

# One row per (movie, genre) for sample 2
genres_sep2 <- movies2 %>%
  separate_rows(genres, sep = "\\|", convert = FALSE) %>%
  replace(. == "", "no genres listed")

# Count ratings per genre and show the top three (sample 2)
genres_sep2 %>%
  right_join(ratings2, "movieId") %>%
  group_by(genres) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(3)
## # A tibble: 3 × 2
##   genres count
##   <chr>  <int>
## 1 Drama  20230
## 2 Comedy 20019
## 3 Action 14118

3.2.3 Conclusion

Die am meist geschauten/bewerteten Genres sind Drama, Comedy und Action.

3.3 Wie verteilen sich die Kundenratings gesamthaft?

3.3.1 Sample 1

# Five-number summary plus mean of all ratings (sample 1)
summary(ratings1$rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.500   3.000   3.500   3.487   4.000   5.000
# Bar chart of the overall rating distribution (sample 1)
ratings1 %>%
  ggplot(aes(rating)) +
  geom_bar() +
  labs(
    title = "Verteilung der Kundenratings",
    x = "Bewertung",
    y = "Anzahl Bewertungen",
    subtitle = paste("Durchschnittsbewertung: ", mean(ratings1$rating))
  ) +
  theme_classic() +
  theme(legend.position = "none")

3.3.2 Sample 2

# Five-number summary plus mean of all ratings (sample 2)
summary(ratings2$rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.500   3.000   3.500   3.494   4.000   5.000
# Bar chart of the overall rating distribution (sample 2)
ratings2 %>%
  ggplot(aes(rating)) +
  geom_bar() +
  labs(
    title = "Verteilung der Kundenratings",
    x = "Bewertung",
    y = "Anzahl Bewertungen",
    subtitle = paste("Durchschnittsbewertung: ", mean(ratings2$rating))
  ) +
  theme_classic() +
  theme(legend.position = "none")

3.3.3 Conclusion

Die Kundenratings sind nicht ganz normalverteilt, aber nahe. Die meisten Bewertungen sind im Bereich der natürlichen Zahlen, wenige Bewertungen sind ein Wert zwischen zwei dieser Zahlen. Öfters enthält eine Bewertung den Wert 4. Der Durchschnitt aller Bewertungen liegt bei etwa 3,5.

3.4 Wie verteilen sich die Kundenratings nach Genres?

3.4.1 Sample 1

# Per-genre rating distribution as within-panel proportions (sample 1)
genres_sep_ratings1 <- genres_sep1 %>%
  right_join(ratings1, "movieId")
ggplot(genres_sep_ratings1, aes(x = rating, fill = genres)) +
  # after_stat(prop) replaces the deprecated ..prop.. notation
  geom_bar(aes(y = after_stat(prop), group = 1)) +
  facet_wrap(~genres) +
  labs(
    title = "Verteilung der Kundenratings nach Genre",
    x = "Bewertung",
    y = "Verteilung"
  ) +
  theme_classic() +
  theme(legend.position = "none")

3.4.2 Sample 2

# Per-genre rating distribution as within-panel proportions (sample 2)
genres_sep_ratings2 <- genres_sep2 %>%
  right_join(ratings2, "movieId")
ggplot(genres_sep_ratings2, aes(x = rating, fill = genres)) +
  # after_stat(prop) replaces the deprecated ..prop.. notation
  geom_bar(aes(y = after_stat(prop), group = 1)) +
  facet_wrap(~genres) +
  labs(
    title = "Verteilung der Kundenratings nach Genre",
    x = "Bewertung",
    y = "Verteilung"
  ) +
  theme_classic() +
  theme(legend.position = "none")

3.4.3 Conclusion

Die Verteilung der Kundenratings ähneln sich bei vielen Kategorien der Verteilung der Gesamtmenge. Jedoch mit einigen Ausnahmen: Dokumentarfilme haben zum Beispiel überdurchschnittlich viele Bewertungen mit dem Wert 4 und unterdurchschnittlich wenig Bewertungen mit dem Wert 3 und 5. Man könnte sagen, dass Dokumentarfilme sehr konstante Ratings haben.

3.5 Wie verteilen sich die mittleren Kundenratings pro Film?

3.5.1 Sample 1.1

# Per-movie mean rating and number of ratings (sample 1)
mean_rating_movie1 <- ratings1 %>%
  group_by(movieId) %>%
  summarise(mean_rating = mean(rating), count = n())

# Histogram of the per-movie mean ratings
mean_rating_movie1 %>%
  ggplot(aes(mean_rating)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Verteilung der mittleren Kundenratings pro Film",
    x = "Durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

3.5.2 Sample 2.1

# Per-movie mean rating and number of ratings (sample 2)
mean_rating_movie2 <- ratings2 %>%
  group_by(movieId) %>%
  summarise(mean_rating = mean(rating), count = n())

# Histogram of the per-movie mean ratings
mean_rating_movie2 %>%
  ggplot(aes(mean_rating)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Verteilung der mittleren Kundenratings pro Film",
    x = "Durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

3.5.3 Conclusion 1

Da einige Filme nur wenige Bewertungen haben, liegen sehr viele Mittelwerte bei ganzen oder halben Zahlen. Deswegen gibt es bei unseren Plots einige hohe Balken.

3.5.4 Sample 1.2

# Same histogram, restricted to movies with at least 5 ratings (sample 1)
mean_rating_movie1 %>%
  filter(count >= 5) %>%
  ggplot(aes(mean_rating)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Verteilung der mittleren Kundenratings pro Film",
    x = "Durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

3.5.5 Sample 2.2

# Same histogram, restricted to movies with at least 5 ratings (sample 2)
mean_rating_movie2 %>%
  filter(count >= 5) %>%
  ggplot(aes(mean_rating)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Verteilung der mittleren Kundenratings pro Film",
    x = "Durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

3.5.6 Conclusion 2

Wenn man alle Filme mit weniger als 5 Bewertungen entfernt, erkennt man, dass die Bewertungen der Filme linksschief verteilt sind.

3.5.7 Sample 1.3

# Scatter: mean rating vs. number of ratings, colored by mean rating (sample 1)
mean_rating_movie1 %>%
  ggplot(aes(mean_rating, count, color = mean_rating)) +
  geom_point(alpha = 0.3) +
  labs(
    title = "Verteilung der mittleren Kundenratings pro Film",
    x = "Durchschnittliche Bewertung",
    y = "Anzahl Bewertungen"
  ) +
  theme_classic() +
  scale_color_gradient(low = "red", high = "green") +
  theme(legend.position = "none")

3.5.8 Sample 2.3

# Scatter: mean rating vs. number of ratings, colored by mean rating (sample 2)
mean_rating_movie2 %>%
  ggplot(aes(mean_rating, count, color = mean_rating)) +
  geom_point(alpha = 0.3) +
  labs(
    title = "Verteilung der mittleren Kundenratings pro Film",
    x = "Durchschnittliche Bewertung",
    y = "Anzahl Bewertungen"
  ) +
  theme_classic() +
  scale_color_gradient(low = "red", high = "green") +
  theme(legend.position = "none")

3.5.9 Conclusion 3

Hier werden die gleichen Daten anders dargestellt. Man erkennt: Je öfter ein Film bewertet wird, desto näher liegt die durchschnittliche Bewertung bei 4. Man kann dies vielleicht damit begründen, dass ein schlechter Film weniger geschaut und deswegen weniger bewertet wird. Jedoch können wir uns nur schwer erklären, wieso Filme mit einer Bewertung über 4 nicht so oft geschaut/bewertet werden.

3.6 Wie stark streuen die Ratings von individuellen Kunden?

3.6.1 Sample 1.1

# Draw four random user ids (1..610) and compare their rating densities.
# NOTE: `sample_values` is reused by later sections, so the name must stay.
sample_values <- sample(1:610, 4, replace = FALSE)

ratings1 %>%
  filter(userId %in% sample_values) %>%
  ggplot(aes(rating)) +
  geom_density(aes(color = factor(userId))) +
  labs(
    title = "Streuung von Bewertungen von Kunden",
    subtitle = "random sample",
    x = "Bewertung",
    y = "Verteilung",
    color = "User ID"
  ) +
  theme_classic()

3.6.2 Sample 2.1

# Fresh random draw of four users, densities on sample 2
sample_values <- sample(1:610, 4, replace = FALSE)

ratings2 %>%
  filter(userId %in% sample_values) %>%
  ggplot(aes(rating)) +
  geom_density(aes(color = factor(userId))) +
  labs(
    title = "Streuung von Bewertungen von Kunden",
    subtitle = "random sample",
    x = "Bewertung",
    y = "Verteilung",
    color = "User ID"
  ) +
  theme_classic()

3.6.3 Sample 1.2

# Per-user rating standard deviation and rating count (sample 1)
sd_ratings1 <- ratings1 %>%
  group_by(userId) %>%
  summarise(SD = sd(rating), count = n())

# Scatter: rating spread vs. number of ratings per user
sd_ratings1 %>%
  ggplot(aes(SD, count, color = count)) +
  geom_point() +
  labs(
    title = "Standardabweichung der Ratings pro User",
    x = "Standardabweichung",
    y = "Anzahl Ratings",
    color = "Anzahl Ratings"
  ) +
  theme_classic() +
  scale_color_gradient(low = "green", high = "black") +
  theme(legend.position = "none")

# Boxplot of the same standard deviations
sd_ratings1 %>%
  ggplot(aes(SD)) +
  geom_boxplot() +
  labs(
    title = "Standardabweichung der Ratings pro User",
    x = "Standardabweichung",
    subtitle = paste("Durchschnittsstandardabweichung: ", mean(sd_ratings1$SD))
  ) +
  theme_classic()

3.6.4 Sample 2.2

# Per-user rating standard deviation and rating count (sample 2)
sd_ratings2 <- ratings2 %>%
  group_by(userId) %>%
  summarise(SD = sd(rating), count = n())

# Scatter: rating spread vs. number of ratings per user
ggplot(sd_ratings2, aes(SD, count, color = count)) +
  geom_point() +
  labs(
    title = "Standardabweichung der Ratings pro User",
    x = "Standardabweichung",
    y = "Anzahl Ratings",
    color = "Anzahl Ratings"
  ) +
  theme_classic() +
  scale_color_gradient(low = "green", high = "black") +
  theme(legend.position = "none")

ggplot(sd_ratings2, aes(SD)) +
  geom_boxplot() +
  labs(
    title = "Standardabweichung der Ratings pro User",
    x = "Standardabweichung",
    # BUG FIX: the subtitle previously averaged sd_ratings1$SD (sample 1)
    # in this sample-2 plot
    subtitle = paste("Durchschnittsstandardabweichung: ", mean(sd_ratings2$SD))
  ) +
  theme_classic()

3.6.5 Conclusion

Der Mittelwert der Standardabweichung der Ratings der User befindet sich um den Wert 0,9. Die Bewertungen streuen sich weniger als bei einer Normalverteilung.

3.7 Welchen Einfluss hat die Normierung der Ratings pro Kunde auf deren Verteilung?

3.7.1 Sample 1

# Z-normalize each user's ratings: subtract the user's mean rating and divide
# by the user's standard deviation (sample 1)
norm_ratings1 <- ratings1 %>%
  group_by(userId) %>%
  summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
  full_join(., ratings1, by = "userId")

norm_ratings1$z_rating <- (norm_ratings1$rating - norm_ratings1$mean_rating) /
  norm_ratings1$sd_rating

# Density of all normalized ratings
ggplot(norm_ratings1, aes(z_rating)) +
  geom_density() +
  labs(
    title = "Normierte Ratings",
    x = "Z-Normiertes Rating",
    y = "Verteilung"
  ) +
  theme_classic()

sample_values1 <- sample(1:610, 4, replace = FALSE)

norm_ratings1 %>%
  # BUG FIX: previously filtered on the stale `sample_values` from section 3.6
  # instead of the freshly drawn `sample_values1` (which was never used)
  filter(userId %in% sample_values1) %>%
  ggplot(., aes(z_rating)) +
  geom_density(aes(color = factor(userId))) +
  labs(
    title = "Normierte Ratings von Kunden",
    subtitle = "random sample",
    x = "Normierte Bewertung",
    y = "Verteilung",
    color = "User ID"
  ) +
  theme_classic()

3.7.2 Sample 2

# Z-normalize each user's ratings (sample 2)
norm_ratings2 <- ratings2 %>%
  group_by(userId) %>%
  summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
  full_join(., ratings2, by = "userId")

norm_ratings2$z_rating <- (norm_ratings2$rating - norm_ratings2$mean_rating) /
  norm_ratings2$sd_rating

# Density of all normalized ratings
ggplot(norm_ratings2, aes(z_rating)) +
  geom_density() +
  labs(
    title = "Normierte Ratings",
    x = "Z-Normiertes Rating",
    y = "Verteilung"
  ) +
  theme_classic()

sample_values2 <- sample(1:610, 4, replace = FALSE)

norm_ratings2 %>%
  # BUG FIX: previously filtered on the stale `sample_values` from section 3.6
  # instead of the freshly drawn `sample_values2` (which was never used)
  filter(userId %in% sample_values2) %>%
  ggplot(., aes(z_rating)) +
  geom_density(aes(color = factor(userId))) +
  labs(
    title = "Normierte Ratings von Kunden",
    subtitle = "random sample",
    x = "Normierte Bewertung",
    y = "Verteilung",
    color = "User ID"
  ) +
  theme_classic()

3.7.3 Conclusion

Der Mittelwert der Bewertungen pro User befindet sich jetzt bei 0. Alle Ratings unter 0 könnte man als “gefällt dem User nicht” interpretieren und alle Ratings über 0 als “gefällt dem User”. Je weiter sich die Bewertung von 0 entfernt, desto stärker ist die positive bzw. negative Präferenz des Users.

3.8 Welche strukturellen Charakteristika (z.B.Sparsity) und Auffälligkeiten zeigt die User-Item Matrix?

3.8.1 Sample 1

# User-item matrix of the z-normalized ratings (sample 1) and its sparsity:
# share of NA cells over all cells
user_item1 <- norm_ratings1 %>%
  select(movieId, userId, z_rating) %>%
  pivot_wider(names_from = movieId, values_from = z_rating)

sum(is.na(user_item1)) / prod(dim(user_item1))
## [1] 0.9863269

3.8.2 Sample 2

# User-item matrix of the z-normalized ratings (sample 2) and its sparsity
user_item2 <- norm_ratings2 %>%
  select(movieId, userId, z_rating) %>%
  pivot_wider(names_from = movieId, values_from = z_rating)

sum(is.na(user_item2)) / prod(dim(user_item2))
## [1] 0.9865087

3.8.3 Conclusion

Die User-Item Matrix ist zu 98.6 % Sparse.

4 Datenreduktion

Die Daten wurden auf 400 Kunden und 700 Filme reduziert, indem Filme und Kunden mit sehr wenigen Ratings entfernt wurden.

# Keep the 700 movies with the most ratings (ties broken by arrange order)
top_n_movies1 <- norm_ratings1 %>%
  group_by(movieId) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(700)

# Restrict the normalized ratings to those 700 movies
user_item_r1 <-
  left_join(
    top_n_movies1,
    norm_ratings1,
    by = "movieId"
  )

# Among those, keep the 400 users with the most ratings
top_n_user1 <- user_item_r1 %>%
  group_by(userId) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(400) %>%
  ungroup()

# Restrict further to those 400 users; keep only id/rating columns
user_item_r1 <-
  left_join(
    top_n_user1,
    user_item_r1,
    by = "userId"
  ) %>%
  select(userId, movieId, z_rating)

# Reshape to a wide user x movie matrix, userId as rownames
m_user_item_r1 <- user_item_r1 %>%
  pivot_wider(names_from = movieId, values_from = z_rating) %>%
  column_to_rownames(., var = "userId")
# Same reduction for sample 2: keep the 700 most-rated movies
top_n_movies2 <- norm_ratings2 %>%
  group_by(movieId) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(700)

# Restrict the normalized ratings to those 700 movies
user_item_r2 <-
  left_join(
    top_n_movies2,
    norm_ratings2,
    by = "movieId"
  )

# Among those, keep the 400 users with the most ratings
top_n_user2 <- user_item_r2 %>%
  group_by(userId) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(400) %>%
  ungroup()

# Restrict further to those 400 users; keep only id/rating columns
user_item_r2 <-
  left_join(
    top_n_user2,
    user_item_r2,
    by = "userId"
  ) %>%
  select(userId, movieId, z_rating)

# Reshape to a wide user x movie matrix, userId as rownames
m_user_item_r2 <- user_item_r2 %>%
  pivot_wider(names_from = movieId, values_from = z_rating) %>%
  column_to_rownames(., var = "userId")

4.1 Sparsity nach Datenreduktion

# Sparsity of reduced sample 1: fraction of NA cells
sum(is.na(m_user_item_r1)) / prod(dim(m_user_item_r1))
## [1] 0.9009821
# Sparsity of reduced sample 2: fraction of NA cells
sum(is.na(m_user_item_r2)) / prod(dim(m_user_item_r2))
## [1] 0.9030429

4.2 Mittlere Kundenratings pro Film vor und nach Datenreduktion

# Sample 1: density of per-movie mean ratings in the reduced matrix
moviemeans_reducted1 <- colMeans(m_user_item_r1, na.rm = TRUE)
moviemeans_reducted1 <- data.frame(moviemeans_reducted1)
ggplot(moviemeans_reducted1, aes(moviemeans_reducted1)) +
  geom_density() +
  labs(
    title = "Streuung von durchschnittlichen Bewertung von Filmen",
    subtitle = "reduzierter Datensatz 1",
    x = "durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

# Sample 2: density of per-movie mean ratings in the reduced matrix
moviemeans_reducted2 <- colMeans(m_user_item_r2, na.rm = TRUE)
moviemeans_reducted2 <- data.frame(moviemeans_reducted2)
ggplot(moviemeans_reducted2, aes(moviemeans_reducted2)) +
  geom_density() +
  labs(
    title = "Streuung von durchschnittlichen Bewertung von Filmen",
    subtitle = "reduzierter Datensatz 2",
    x = "durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

# Same density on the full (non-reduced) matrices for comparison
moviemeans1 <- colMeans(user_item1, na.rm = TRUE)
moviemeans1 <- data.frame(moviemeans1)
ggplot(moviemeans1, aes(moviemeans1)) +
  geom_density() +
  labs(
    title = "Streuung von durchschnittlichen Bewertung von Filmen",
    subtitle = "kompletter Datensatz",
    x = "durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

moviemeans2 <- colMeans(user_item2, na.rm = TRUE)
moviemeans2 <- data.frame(moviemeans2)
ggplot(moviemeans2, aes(moviemeans2)) +
  geom_density() +
  labs(
    title = "Streuung von durchschnittlichen Bewertung von Filmen",
    subtitle = "kompletter Datensatz",
    x = "durchschnittliche Bewertung",
    y = "Verteilung"
  ) +
  theme_classic()

4.3 Quantifiziere “Intersection over Union” der Ratings der unterschiedlich reduzierten Datensätze.

# Intersection over Union of the (userId, movieId) rating pairs present in
# both reduced data sets. Renamed the variables: `intersection`/`union`
# masked the base R set functions intersect()/union().
n_intersection <- nrow(inner_join(user_item_r1, user_item_r2, by = c("movieId", "userId")))
n_union <- nrow(user_item_r1) + nrow(user_item_r2) - n_intersection
n_intersection / n_union
## [1] 0.294602

5 Analyse Ähnlichkeitsmatrix

5.1 Zerlege den reduzierten MovieLense Datensatz in ein disjunktes Trainings- und Testdatenset im Verhältnis 4:1

# Sample 1: disjoint 80/20 train-test split of the reduced user-item matrix
set.seed(69)
split1 <- initial_split(m_user_item_r1, prop = 0.80)
training1 <- as.matrix(training(split1))
test1 <- as.matrix(testing(split1))

# Sample 2
set.seed(100)
# BUG FIX: split2 was previously built from m_user_item_r1, so both "samples"
# were split from the same (sample 1) data
split2 <- initial_split(m_user_item_r2, prop = 0.80)
training2 <- as.matrix(training(split2))
test2 <- as.matrix(testing(split2))

5.2 Trainiere ein IBCF Modell mit 30 Nachbarn und Cosine Similarity

# Sample 1: IBCF recommender with cosine similarity and 30 nearest items,
# then top-15 recommendation lists for the test users
IBCF1 <- Recommender(as(training1, "realRatingMatrix"), "IBCF",
  param = list(normalize = NULL, method = "cosine", k = 30)
)
p1 <- predict(IBCF1, as(test1, "realRatingMatrix"), type = "topNList", n = 15)

# Sample 2: identical model on the second split
IBCF2 <- Recommender(as(training2, "realRatingMatrix"), "IBCF",
  param = list(normalize = NULL, method = "cosine", k = 30)
)
p2 <- predict(IBCF2, as(test2, "realRatingMatrix"), type = "topNList", n = 15)

5.3 Bestimme die Verteilung der Filme, welche bei IBCF für paarweise Ähnlichkeitsvergleiche verwendet werden

#sample 1
moviesIBCF1<- table(unlist(as(p1,"list"))) %>%
  as.data.frame()%>%
  rename(movieId = Var1)%>%
  arrange(desc(Freq))

ggplot(moviesIBCF1,aes(Freq))+
  geom_histogram(bins =30)

#sample 2
moviesIBCF2<- table(unlist(as(p2,"list"))) %>%
  as.data.frame()%>%
  rename(movieId = Var1)%>%
  arrange(desc(Freq))


ggplot(moviesIBCF2,aes(Freq))+
  geom_histogram(bins =30)

5.4 Bestimme die Filme, die am häufigsten in der Cosine-Ähnlichkeitsmatrix auftauchen und analysiere deren Vorkommen und Ratings im reduzierten Datensatz. (Nur Neighbours oder alle?)

# Sample 1
# Item-item cosine similarity matrix on the training data
sim_matrix1 <- as.matrix(similarity(as(training1, "realRatingMatrix"), method = "cosine", which = "items"))
# Per movie (column): how many similarity values are defined (non-NA)
row_count1 <- colSums(!is.na(sim_matrix1))
row_count1 <- as.data.frame(row_count1)
row_count1 <- cbind(movieId = rownames(row_count1), row_count1) %>%
  arrange(desc(row_count1))
# Drop rownames. FIX: removed the redundant `rownames(...) <- 1:nrow(...)`
# assignment, which was immediately overwritten by NULL.
rownames(row_count1) <- NULL
# Sample 2: same computation on the second training set
sim_matrix2 <- as.matrix(similarity(as(training2, "realRatingMatrix"), method = "cosine", which = "items"))
row_count2 <- colSums(!is.na(sim_matrix2))
row_count2 <- as.data.frame(row_count2)
row_count2 <- cbind(movieId = rownames(row_count2), row_count2) %>%
  arrange(desc(row_count2))
rownames(row_count2) <- NULL
# Sample 1: top-30 most connected movies, joined with their normalized
# ratings; count and mean z-rating per movie
moviessmallIBCF1 <- head(row_count1, 30) %>%
  convert(int(movieId)) %>%
  left_join(norm_ratings1) %>%
  group_by(movieId) %>%
  summarise(count = n(), mean = mean(z_rating))

# Sample 2: same aggregation
moviessmallIBCF2 <- head(row_count2, 30) %>%
  convert(int(movieId)) %>%
  left_join(norm_ratings2) %>%
  group_by(movieId) %>%
  summarise(count = n(), mean = mean(z_rating))

5.4.1 Häufigkeit der Filme

# Rating counts of the 30 most connected movies, both samples
ggplot(moviessmallIBCF1, aes(count)) +
  geom_histogram()

ggplot(moviessmallIBCF2, aes(count)) +
  geom_histogram()

5.4.2 Durchschnittliche Ratings der Filme

# Mean normalized ratings of the 30 most connected movies, both samples
ggplot(moviessmallIBCF1, aes(mean)) +
  geom_density()

ggplot(moviessmallIBCF2, aes(mean)) +
  geom_density()

6 Implementierung Ähnlichkeitsmatrix

6.1 Berechnung der cosinus/jaccard similarity

calculate_jaccard <- function(arr1, arr2) {
  # Jaccard similarity over the positions where BOTH inputs are non-NA.
  # Position 1 holds the movieId and is always excluded from the comparison.
  shared <- (!is.na(array(arr1)) & !is.na(array(arr2)))
  shared[1] <- FALSE
  # No commonly rated positions -> similarity is undefined
  if (sum(shared) == 0) {
    return(NA)
  }
  agree <- arr1[shared] & arr2[shared]
  any_true <- arr1[shared] | arr2[shared]
  sum(agree) / sum(any_true)
}

calculate_cos <- function(arr1, arr2) {
  # Cosine similarity over the positions where BOTH inputs are non-NA,
  # rescaled from [-1, 1] onto [0, 1]. Position 1 (movieId) is excluded.
  shared <- (!is.na(array(arr1)) & !is.na(array(arr2)))
  shared[1] <- FALSE
  # No commonly rated positions -> similarity is undefined
  if (sum(shared) == 0) {
    return(NA)
  }
  a <- arr1[shared]
  b <- arr2[shared]
  dot <- crossprod(a, b)
  # Rescale cosine into [0, 1] so it is comparable to Jaccard
  ((dot / (norm(a, type = "2") * norm(b, type = "2"))) + 1) / 2
}

6.2 Berechnung der Ähnlichkeitsmatrix

getCorrelationMatrix <- function(data, cos = TRUE) {
  # Build a symmetric item-item similarity matrix for all movies in `data`.
  # `data`: one row per movie; first column movieId, remaining columns ratings.
  # `cos = TRUE` -> cosine similarity, otherwise Jaccard similarity.
  # Returns a length(movies) x length(movies) matrix named by movieId.
  movies <- as.character(data$movieId)
  n <- length(movies)

  # Create the similarity matrix and pre-set the diagonal to 1 (the diagonal
  # is still recomputed below, matching the original behavior, so an
  # all-missing row can end up NaN on the diagonal)
  correlations <- matrix(
    NA,
    nrow = n,
    ncol = n,
    dimnames = list(movies, movies)
  )
  diag(correlations) <- 1

  # PERF FIX: extract every movie's row exactly once up front; the original
  # re-ran `filter()` inside the inner loop (O(n^2) filter calls -> O(n)).
  rows <- lapply(movies, function(m) data %>% filter(movieId == m))

  # Fill the upper triangle (including the diagonal) and mirror it
  for (a in seq_len(n)) {
    for (b in a:n) {
      sim <- if (cos) {
        calculate_cos(rows[[a]], rows[[b]])
      } else {
        calculate_jaccard(rows[[a]], rows[[b]])
      }
      correlations[a, b] <- sim
      correlations[b, a] <- sim
    }
  }
  correlations
}

numToBool <- function(x) (x >= 0)

6.3 Test der Funktionen

6.3.1 Sample 1

# Sample 1: build a 100-movie item-user matrix of normalized ratings
set.seed(100)
sample_values1 <- sample(1:6819, 300, replace = FALSE)

# Recompute the per-user z-normalized ratings (same as in section 3.7)
norm_ratings1 <- ratings1 %>%
  group_by(userId) %>%
  summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
  full_join(., ratings1, by = "userId")

norm_ratings1$z_rating <- (norm_ratings1$rating - norm_ratings1$mean_rating) /
  norm_ratings1$sd_rating

# Wide matrix (movies x users), restricted to 100 of the sampled movieIds
item_user_random_100_1 <- norm_ratings1 %>%
  select(movieId, userId, z_rating) %>%
  pivot_wider(names_from = userId, values_from = z_rating) %>%
  filter(movieId %in% sample_values1) %>%
  head(100)

# Binary like/dislike version (z_rating >= 0) of the same matrix
item_user_random_100_bool1 <- item_user_random_100_1 %>% mutate(across(!matches("movieId"), numToBool))
# Similarity matrices computed with the hand-written functions

corrNumb1 <- getCorrelationMatrix(item_user_random_100_1, cos = TRUE)
corrBool1 <- getCorrelationMatrix(item_user_random_100_bool1, cos = FALSE)

6.3.2 Sample 2

# Sample 2: build a 100-movie item-user matrix of normalized ratings
set.seed(100)
sample_values2 <- sample(1:6819, 300, replace = FALSE)

# Recompute the per-user z-normalized ratings (same as in section 3.7)
norm_ratings2 <- ratings2 %>%
  group_by(userId) %>%
  summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
  full_join(., ratings2, by = "userId")

norm_ratings2$z_rating <- (norm_ratings2$rating - norm_ratings2$mean_rating) /
  norm_ratings2$sd_rating

# Wide matrix (movies x users), restricted to 100 of the sampled movieIds
item_user_random_100_2 <- norm_ratings2 %>%
  select(movieId, userId, z_rating) %>%
  pivot_wider(names_from = userId, values_from = z_rating) %>%
  # BUG FIX: previously filtered on sample_values1 (sample 1's draw). With
  # the identical seed the draws coincide, but the latent bug is fixed.
  filter(movieId %in% sample_values2) %>%
  head(100)

# Binary like/dislike version (z_rating >= 0) of the same matrix
item_user_random_100_bool2 <- item_user_random_100_2 %>% mutate(across(!matches("movieId"), numToBool))

corrNumb2 <- getCorrelationMatrix(item_user_random_100_2, cos = TRUE)
corrBool2 <- getCorrelationMatrix(item_user_random_100_bool2, cos = FALSE)

6.4 Vergleich mit recommenderlabs

6.4.1 Sample 1

# Movie ids to rownames, then transpose -> users x items for recommenderlab
item_user_random_100_recommenderlab1 <- item_user_random_100_1 %>%
  column_to_rownames(., var = "movieId") %>%
  as.matrix(.) %>%
  t(.)

# Item-item cosine similarity via recommenderlab (compare with corrNumb1)
corrNumbRL1 <- as.matrix(similarity(as(item_user_random_100_recommenderlab1, "realRatingMatrix"), method = "cosine", which = "items"))

corrNumbRL1[1:6,1:6]
##           2478        3273       457       223        2366      3386
## 2478        NA 0.387965776 0.1878327 0.6764612 0.583930818 0.2678179
## 3273 0.3879658          NA 0.2105259 0.4300789 0.003323487 0.3019346
## 457  0.1878327 0.210525910        NA 0.5498116 0.506941205 0.6544472
## 223  0.6764612 0.430078886 0.5498116        NA 0.473340847 0.5391478
## 2366 0.5839308 0.003323487 0.5069412 0.4733408          NA 0.8759123
## 3386 0.2678179 0.301934611 0.6544472 0.5391478 0.875912271        NA
corrNumb1[1:6,1:6]
##           2478       3273       457       223       2366      3386
## 2478 1.0000000 0.38796536 0.1878325 0.6764613 0.58393080 0.2678174
## 3273 0.3879654 1.00000000 0.2105260 0.4300789 0.00332348 0.3019346
## 457  0.1878325 0.21052604 1.0000000 0.5498115 0.50694134 0.6544472
## 223  0.6764613 0.43007890 0.5498115 1.0000000 0.47334097 0.5391479
## 2366 0.5839308 0.00332348 0.5069413 0.4733410 1.00000000 0.8759123
## 3386 0.2678174 0.30193458 0.6544472 0.5391479 0.87591235 1.0000000
# Same transposition for the binary like/dislike matrix
item_user_random_100_bool_recommenderlab1 <- item_user_random_100_bool1 %>%
  column_to_rownames(., var = "movieId") %>%
  t(.)

# Item-item Jaccard similarity via recommenderlab (compare with corrBool1)
corrBoolRL1 <- as.matrix(similarity(as(item_user_random_100_bool_recommenderlab1, "realRatingMatrix"), method = "jaccard", which = "items"))

corrBoolRL1[1:6,1:6]
##           2478 3273       457       223 2366      3386
## 2478        NA 0.00 0.1428571 0.3333333 0.00 0.0000000
## 3273 0.0000000   NA 0.5000000 0.2500000 0.00 0.5000000
## 457  0.1428571 0.50        NA 0.6521739 0.40 0.5714286
## 223  0.3333333 0.25 0.6521739        NA 0.25 0.4285714
## 2366 0.0000000 0.00 0.4000000 0.2500000   NA 1.0000000
## 3386 0.0000000 0.50 0.5714286 0.4285714 1.00        NA
corrBool1[1:6,1:6]
##           2478 3273       457       223 2366      3386
## 2478 1.0000000 0.00 0.1428571 0.3333333 0.00 0.0000000
## 3273 0.0000000 1.00 0.5000000 0.2500000 0.00 0.5000000
## 457  0.1428571 0.50 1.0000000 0.6521739 0.40 0.5714286
## 223  0.3333333 0.25 0.6521739 1.0000000 0.25 0.4285714
## 2366 0.0000000 0.00 0.4000000 0.2500000 1.00       NaN
## 3386 0.0000000 0.50 0.5714286 0.4285714  NaN 1.0000000

6.4.2 Sample 2

# Sample 2: transpose to users x items for recommenderlab
item_user_random_100_recommenderlab2 <- item_user_random_100_2 %>%
  column_to_rownames(., var = "movieId") %>%
  as.matrix(.) %>%
  t(.)

# Item-item cosine similarity via recommenderlab (compare with corrNumb2)
corrNumbRL2 <- as.matrix(similarity(as(item_user_random_100_recommenderlab2, "realRatingMatrix"), method = "cosine", which = "items"))

corrNumbRL2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223         NA 0.5379408 0.6370851 0.8101851 0.5567510 0.5837510
## 2529 0.5379408        NA 0.4592246 0.7173766 0.6348775 0.5151517
## 2478 0.6370851 0.4592246        NA 0.3956517 0.6179636 0.3160116
## 2329 0.8101851 0.7173766 0.3956517        NA 0.4738899 0.3890170
## 3052 0.5567510 0.6348775 0.6179636 0.4738899        NA 0.6549095
## 3273 0.5837510 0.5151517 0.3160116 0.3890170 0.6549095        NA
corrNumb2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223  1.0000000 0.5379408 0.6370851 0.8101851 0.5567510 0.5837511
## 2529 0.5379408 1.0000000 0.4592246 0.7173767 0.6348775 0.5151518
## 2478 0.6370851 0.4592246 1.0000000 0.3956517 0.6179635 0.3160112
## 2329 0.8101851 0.7173767 0.3956517 1.0000000 0.4738899 0.3890170
## 3052 0.5567510 0.6348775 0.6179635 0.4738899 1.0000000 0.6549095
## 3273 0.5837511 0.5151518 0.3160112 0.3890170 0.6549095 1.0000000
# Sample 2: transpose the binary matrix
item_user_random_100_bool_recommenderlab2 <- item_user_random_100_bool2 %>%
  column_to_rownames(., var = "movieId") %>%
  t(.)

# Item-item Jaccard similarity via recommenderlab (compare with corrBool2)
corrBoolRL2 <- as.matrix(similarity(as(item_user_random_100_bool_recommenderlab2, "realRatingMatrix"), method = "jaccard", which = "items"))

corrBoolRL2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223         NA 0.4166667 0.4000000 0.8235294 0.5263158 0.2500000
## 2529 0.4166667        NA 0.4000000 0.6250000 0.6250000 0.5000000
## 2478 0.4000000 0.4000000        NA 0.2500000 0.3333333 0.0000000
## 2329 0.8235294 0.6250000 0.2500000        NA 0.6111111 0.2222222
## 3052 0.5263158 0.6250000 0.3333333 0.6111111        NA 0.2000000
## 3273 0.2500000 0.5000000 0.0000000 0.2222222 0.2000000        NA
corrBool2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223  1.0000000 0.4166667 0.4000000 0.8235294 0.5263158 0.2500000
## 2529 0.4166667 1.0000000 0.4000000 0.6250000 0.6250000 0.5000000
## 2478 0.4000000 0.4000000 1.0000000 0.2500000 0.3333333 0.0000000
## 2329 0.8235294 0.6250000 0.2500000 1.0000000 0.6111111 0.2222222
## 3052 0.5263158 0.6250000 0.3333333 0.6111111 1.0000000 0.2000000
## 3273 0.2500000 0.5000000 0.0000000 0.2222222 0.2000000 1.0000000

6.5 Vergleich mit coop (cosine) und vegan (jaccard)

6.5.1 Sample 1

corrNumbC1 <- coop::cosine(item_user_random_100_recommenderlab1, use = 'everything')

corrNumbC1[1:6,1:6]
##      2478 3273 457 223 2366 3386
## 2478    1   NA  NA  NA   NA   NA
## 3273   NA    1  NA  NA   NA   NA
## 457    NA   NA   1  NA   NA   NA
## 223    NA   NA  NA   1   NA   NA
## 2366   NA   NA  NA  NA    1   NA
## 3386   NA   NA  NA  NA   NA    1
corrNumb1[1:6,1:6]
##           2478       3273       457       223       2366      3386
## 2478 1.0000000 0.38796536 0.1878325 0.6764613 0.58393080 0.2678174
## 3273 0.3879654 1.00000000 0.2105260 0.4300789 0.00332348 0.3019346
## 457  0.1878325 0.21052604 1.0000000 0.5498115 0.50694134 0.6544472
## 223  0.6764613 0.43007890 0.5498115 1.0000000 0.47334097 0.5391479
## 2366 0.5839308 0.00332348 0.5069413 0.4733410 1.00000000 0.8759123
## 3386 0.2678174 0.30193458 0.6544472 0.5391479 0.87591235 1.0000000
# Jaccard via vegan; NOTE vegdist returns a DISTANCE (1 - similarity), which
# explains the inverted values relative to corrBool1 below
corrBoolVG1 <- vegdist(item_user_random_100_bool_recommenderlab1 %>% t(.), method = "jaccard", na.rm = TRUE) %>%
  as.matrix(.)

corrBoolVG1[1:6,1:6]
##           2478 3273       457       223 2366      3386
## 2478 0.0000000 1.00 0.8571429 0.6666667 1.00 1.0000000
## 3273 1.0000000 0.00 0.5000000 0.7500000 1.00 0.5000000
## 457  0.8571429 0.50 0.0000000 0.3478261 0.60 0.4285714
## 223  0.6666667 0.75 0.3478261 0.0000000 0.75 0.5714286
## 2366 1.0000000 1.00 0.6000000 0.7500000 0.00       NaN
## 3386 1.0000000 0.50 0.4285714 0.5714286  NaN 0.0000000
corrBool1[1:6,1:6]
##           2478 3273       457       223 2366      3386
## 2478 1.0000000 0.00 0.1428571 0.3333333 0.00 0.0000000
## 3273 0.0000000 1.00 0.5000000 0.2500000 0.00 0.5000000
## 457  0.1428571 0.50 1.0000000 0.6521739 0.40 0.5714286
## 223  0.3333333 0.25 0.6521739 1.0000000 0.25 0.4285714
## 2366 0.0000000 0.00 0.4000000 0.2500000 1.00       NaN
## 3386 0.0000000 0.50 0.5714286 0.4285714  NaN 1.0000000

6.5.2 Sample 2

corrNumbC2 <- coop::cosine(item_user_random_100_recommenderlab2, use = 'everything')

corrNumbC2[1:6,1:6]
##      223 2529 2478 2329 3052 3273
## 223    1   NA   NA   NA   NA   NA
## 2529  NA    1   NA   NA   NA   NA
## 2478  NA   NA    1   NA   NA   NA
## 2329  NA   NA   NA    1   NA   NA
## 3052  NA   NA   NA   NA    1   NA
## 3273  NA   NA   NA   NA   NA    1
corrNumb2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223  1.0000000 0.5379408 0.6370851 0.8101851 0.5567510 0.5837511
## 2529 0.5379408 1.0000000 0.4592246 0.7173767 0.6348775 0.5151518
## 2478 0.6370851 0.4592246 1.0000000 0.3956517 0.6179635 0.3160112
## 2329 0.8101851 0.7173767 0.3956517 1.0000000 0.4738899 0.3890170
## 3052 0.5567510 0.6348775 0.6179635 0.4738899 1.0000000 0.6549095
## 3273 0.5837511 0.5151518 0.3160112 0.3890170 0.6549095 1.0000000
# Sample 2: vegan Jaccard DISTANCE (1 - similarity), compare with corrBool2
corrBoolVG2 <- vegdist(item_user_random_100_bool_recommenderlab2 %>% t(.), method = "jaccard", na.rm = TRUE) %>%
  as.matrix(.)

corrBoolVG2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223  0.0000000 0.5833333 0.6000000 0.1764706 0.4736842 0.7500000
## 2529 0.5833333 0.0000000 0.6000000 0.3750000 0.3750000 0.5000000
## 2478 0.6000000 0.6000000 0.0000000 0.7500000 0.6666667 1.0000000
## 2329 0.1764706 0.3750000 0.7500000 0.0000000 0.3888889 0.7777778
## 3052 0.4736842 0.3750000 0.6666667 0.3888889 0.0000000 0.8000000
## 3273 0.7500000 0.5000000 1.0000000 0.7777778 0.8000000 0.0000000
corrBool2[1:6,1:6]
##            223      2529      2478      2329      3052      3273
## 223  1.0000000 0.4166667 0.4000000 0.8235294 0.5263158 0.2500000
## 2529 0.4166667 1.0000000 0.4000000 0.6250000 0.6250000 0.5000000
## 2478 0.4000000 0.4000000 1.0000000 0.2500000 0.3333333 0.0000000
## 2329 0.8235294 0.6250000 0.2500000 1.0000000 0.6111111 0.2222222
## 3052 0.5263158 0.6250000 0.3333333 0.6111111 1.0000000 0.2000000
## 3273 0.2500000 0.5000000 0.0000000 0.2222222 0.2000000 1.0000000

6.6 Vergleich der Korrelationsmatrizen

Die Korrelationsmatrix mit ordinalen Ratings scheint viel detailliertere Korrelationswerte zurückzugeben, da wir genaue Ratings der User haben. Da mit der Umwandlung zu binären Werten diese Informationen verloren gehen, sieht die Korrelationsmatrix mit binären Werten dementsprechend weniger hochauflösend aus.